import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides pandas deprecation / FutureWarning messages raised by later cells.
warnings.filterwarnings("ignore")
# Load the raw dataset from a CSV file given by a relative path.
data=pd.read_csv("PEP1.csv")
# Preview the first rows to sanity-check the parse.
data.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# (row count, column count) of the raw frame.
data.shape
(1460, 81)
# Show every column label of the raw frame.
data.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchebvGr', 'KitchenQual', 'TotRmsAbvGrd',
'Functiol', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice'],
dtype='object')
# Number of distinct values in each column, selecting the columns through the
# frame's own label index.
df = data.columns
data.loc[:, df].nunique()
Id 1460
MSSubClass 15
MSZoning 5
LotFrontage 110
LotArea 1073
...
MoSold 12
YrSold 5
SaleType 9
SaleCondition 6
SalePrice 663
Length: 81, dtype: int64
# Keep only the numeric-dtype columns for the quantitative analysis.
data_num = data.select_dtypes(include="number")
data_num
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | ... | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | 208500 |
| 1 | 2 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | ... | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 181500 |
| 2 | 3 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | ... | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | 223500 |
| 3 | 4 | 70 | 60.0 | 9550 | 7 | 5 | 1915 | 1970 | 0.0 | 216 | ... | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 | 140000 |
| 4 | 5 | 60 | 84.0 | 14260 | 8 | 5 | 2000 | 2000 | 350.0 | 655 | ... | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 | 250000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 1456 | 60 | 62.0 | 7917 | 6 | 5 | 1999 | 2000 | 0.0 | 0 | ... | 0 | 40 | 0 | 0 | 0 | 0 | 0 | 8 | 2007 | 175000 |
| 1456 | 1457 | 20 | 85.0 | 13175 | 6 | 6 | 1978 | 1988 | 119.0 | 790 | ... | 349 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2010 | 210000 |
| 1457 | 1458 | 70 | 66.0 | 9042 | 7 | 9 | 1941 | 2006 | 0.0 | 275 | ... | 0 | 60 | 0 | 0 | 0 | 0 | 2500 | 5 | 2010 | 266500 |
| 1458 | 1459 | 20 | 68.0 | 9717 | 5 | 6 | 1950 | 1996 | 0.0 | 49 | ... | 366 | 0 | 112 | 0 | 0 | 0 | 0 | 4 | 2010 | 142125 |
| 1459 | 1460 | 20 | 75.0 | 9937 | 5 | 6 | 1965 | 1965 | 0.0 | 830 | ... | 736 | 68 | 0 | 0 | 0 | 0 | 0 | 6 | 2008 | 147500 |
1460 rows × 38 columns
# Missing-value count per numeric column.
data_num.isna().sum()
Id 0 MSSubClass 0 LotFrontage 259 LotArea 0 OverallQual 0 OverallCond 0 YearBuilt 0 YearRemodAdd 0 MasVnrArea 8 BsmtFinSF1 0 BsmtFinSF2 0 BsmtUnfSF 0 TotalBsmtSF 0 1stFlrSF 0 2ndFlrSF 0 LowQualFinSF 0 GrLivArea 0 BsmtFullBath 0 BsmtHalfBath 0 FullBath 0 HalfBath 0 BedroomAbvGr 0 KitchebvGr 0 TotRmsAbvGrd 0 Fireplaces 0 GarageYrBlt 81 GarageCars 0 GarageArea 0 WoodDeckSF 0 OpenPorchSF 0 EnclosedPorch 0 3SsnPorch 0 ScreenPorch 0 PoolArea 0 MiscVal 0 MoSold 0 YrSold 0 SalePrice 0 dtype: int64
# Impute LotFrontage and GarageYrBlt with their column medians.
# The filled frame is assigned back rather than calling
# `Series.fillna(inplace=True)` on a column selection: that form is chained
# assignment, which pandas deprecates and which leaves `data_num` untouched
# when Copy-on-Write is enabled.
median_fill = {
    "LotFrontage": data_num["LotFrontage"].median(),
    "GarageYrBlt": data_num["GarageYrBlt"].median(),
}
data_num = data_num.fillna(value=median_fill)
# Re-check the per-column missing counts after imputation.
data_num.isnull().sum()
Id 0 MSSubClass 0 LotFrontage 0 LotArea 0 OverallQual 0 OverallCond 0 YearBuilt 0 YearRemodAdd 0 MasVnrArea 8 BsmtFinSF1 0 BsmtFinSF2 0 BsmtUnfSF 0 TotalBsmtSF 0 1stFlrSF 0 2ndFlrSF 0 LowQualFinSF 0 GrLivArea 0 BsmtFullBath 0 BsmtHalfBath 0 FullBath 0 HalfBath 0 BedroomAbvGr 0 KitchebvGr 0 TotRmsAbvGrd 0 Fireplaces 0 GarageYrBlt 0 GarageCars 0 GarageArea 0 WoodDeckSF 0 OpenPorchSF 0 EnclosedPorch 0 3SsnPorch 0 ScreenPorch 0 PoolArea 0 MiscVal 0 MoSold 0 YrSold 0 SalePrice 0 dtype: int64
# Discard every row that still holds a missing value, then confirm none remain.
data_num.dropna(axis=0, how="any", inplace=True)
data_num.isna().sum()
Id 0 MSSubClass 0 LotFrontage 0 LotArea 0 OverallQual 0 OverallCond 0 YearBuilt 0 YearRemodAdd 0 MasVnrArea 0 BsmtFinSF1 0 BsmtFinSF2 0 BsmtUnfSF 0 TotalBsmtSF 0 1stFlrSF 0 2ndFlrSF 0 LowQualFinSF 0 GrLivArea 0 BsmtFullBath 0 BsmtHalfBath 0 FullBath 0 HalfBath 0 BedroomAbvGr 0 KitchebvGr 0 TotRmsAbvGrd 0 Fireplaces 0 GarageYrBlt 0 GarageCars 0 GarageArea 0 WoodDeckSF 0 OpenPorchSF 0 EnclosedPorch 0 3SsnPorch 0 ScreenPorch 0 PoolArea 0 MiscVal 0 MoSold 0 YrSold 0 SalePrice 0 dtype: int64
# Kernel density estimate of LotFrontage, used to inspect the shape and skew
# of its distribution.
data_num["LotFrontage"].plot.kde()
<AxesSubplot:ylabel='Density'>
# Show the labels of the numeric columns that remain after cleaning.
data_num.columns
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchebvGr', 'TotRmsAbvGrd', 'Fireplaces',
'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
'MoSold', 'YrSold', 'SalePrice'],
dtype='object')
# Columns to correlate. Taken from the frame itself so the list cannot drift
# out of sync with the data; the previous literal was a pasted copy of
# `data_num.columns`.
num_col = data_num.columns.tolist()
# Correlation Matrix formation (pairwise, using the `corr()` default method)
corr_matrix = data_num.loc[:, num_col].corr()
print(corr_matrix)
Id MSSubClass LotFrontage LotArea OverallQual \
Id 1.000000 0.011687 -0.011431 -0.032844 -0.032883
MSSubClass 0.011687 1.000000 -0.354975 -0.138054 0.034491
LotFrontage -0.011431 -0.354975 1.000000 0.304684 0.233147
LotArea -0.032844 -0.138054 0.304684 1.000000 0.106324
OverallQual -0.032883 0.034491 0.233147 0.106324 1.000000
OverallCond 0.013029 -0.061330 -0.053072 -0.002269 -0.090628
YearBuilt -0.015129 0.028397 0.116424 0.015639 0.571111
YearRemodAdd -0.024224 0.041047 0.082958 0.015126 0.549573
MasVnrArea -0.050298 0.022936 0.179459 0.104160 0.411876
BsmtFinSF1 -0.007242 -0.069575 0.215610 0.213063 0.236823
BsmtFinSF2 -0.005516 -0.066137 0.042781 0.111686 -0.058039
BsmtUnfSF -0.008274 -0.138922 0.121872 -0.004227 0.309602
TotalBsmtSF -0.017912 -0.236906 0.362862 0.258409 0.537122
1stFlrSF 0.008684 -0.250050 0.414458 0.295919 0.476936
2ndFlrSF 0.007333 0.308104 0.073386 0.052935 0.298543
LowQualFinSF -0.044125 0.046413 0.037645 0.004904 -0.029998
GrLivArea 0.008356 0.076930 0.368004 0.261159 0.594417
BsmtFullBath 0.001030 0.003807 0.091245 0.157702 0.108505
BsmtHalfBath -0.019809 -0.002633 -0.006822 0.048377 -0.039207
FullBath 0.005673 0.136306 0.179172 0.122457 0.552266
HalfBath 0.005652 0.176165 0.047926 0.016290 0.271466
BedroomAbvGr 0.041511 -0.021651 0.236915 0.117778 0.105900
KitchebvGr 0.004806 0.286572 -0.004674 -0.024697 -0.184642
TotRmsAbvGrd 0.029185 0.042406 0.320213 0.187990 0.430549
Fireplaces -0.017536 -0.044466 0.233661 0.269643 0.400398
GarageYrBlt -0.002362 0.082333 0.062494 -0.025139 0.512611
GarageCars 0.014997 -0.039043 0.268705 0.154739 0.599734
GarageArea 0.015399 -0.098141 0.323132 0.180778 0.560543
WoodDeckSF -0.028146 -0.012634 0.074498 0.173167 0.240652
OpenPorchSF -0.006176 -0.005462 0.134672 0.086301 0.303482
EnclosedPorch 0.004554 -0.010571 0.010692 -0.023094 -0.112950
3SsnPorch -0.046541 -0.044049 0.062176 0.020574 0.031029
ScreenPorch 0.001769 -0.026414 0.037946 0.043511 0.066403
PoolArea 0.057268 0.008214 0.181292 0.077888 0.065743
MiscVal -0.006114 -0.007805 -0.000192 0.038226 -0.031129
MoSold 0.018962 -0.013840 0.009735 0.003203 0.068760
YrSold 0.002776 -0.021529 0.007324 -0.012977 -0.025186
SalePrice -0.025343 -0.082813 0.333184 0.264674 0.789997
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 \
Id 0.013029 -0.015129 -0.024224 -0.050298 -0.007242
MSSubClass -0.061330 0.028397 0.041047 0.022936 -0.069575
LotFrontage -0.053072 0.116424 0.082958 0.179459 0.215610
LotArea -0.002269 0.015639 0.015126 0.104160 0.213063
OverallQual -0.090628 0.571111 0.549573 0.411876 0.236823
OverallCond 1.000000 -0.376763 0.075121 -0.128101 -0.041927
YearBuilt -0.376763 1.000000 0.590674 0.315707 0.249239
YearRemodAdd 0.075121 0.590674 1.000000 0.179618 0.127609
MasVnrArea -0.128101 0.315707 0.179618 1.000000 0.264736
BsmtFinSF1 -0.041927 0.249239 0.127609 0.264736 1.000000
BsmtFinSF2 0.039333 -0.047816 -0.066672 -0.072319 -0.049287
BsmtUnfSF -0.136934 0.149810 0.181828 0.114442 -0.496137
TotalBsmtSF -0.167230 0.392562 0.291492 0.363936 0.520533
1stFlrSF -0.138814 0.284570 0.242488 0.344501 0.443232
2ndFlrSF 0.027473 0.009566 0.140225 0.174561 -0.135715
LowQualFinSF 0.025140 -0.183749 -0.062045 -0.069071 -0.064345
GrLivArea -0.076541 0.199343 0.288279 0.390857 0.206027
BsmtFullBath -0.051567 0.186305 0.118169 0.085310 0.647346
BsmtHalfBath 0.117290 -0.037072 -0.011312 0.026673 0.068611
FullBath -0.190396 0.469625 0.440329 0.276833 0.055808
HalfBath -0.061434 0.240417 0.181063 0.201444 0.001952
BedroomAbvGr 0.014274 -0.068619 -0.038429 0.102821 -0.105691
KitchebvGr -0.081254 -0.173951 -0.148527 -0.037610 -0.086473
TotRmsAbvGrd -0.055964 0.097440 0.193988 0.280682 0.044074
Fireplaces -0.020120 0.150148 0.114806 0.249070 0.258300
GarageYrBlt -0.306149 0.775884 0.614468 0.248346 0.147402
GarageCars -0.184866 0.537492 0.419815 0.364204 0.222241
GarageArea -0.151062 0.478439 0.370674 0.373066 0.295493
WoodDeckSF -0.004530 0.226891 0.207464 0.159718 0.205350
OpenPorchSF -0.031172 0.185081 0.223491 0.125703 0.107696
EnclosedPorch 0.074731 -0.386839 -0.192367 -0.110204 -0.105608
3SsnPorch 0.025163 0.032037 0.045907 0.018796 0.026995
ScreenPorch 0.054016 -0.049169 -0.037656 0.061466 0.063299
PoolArea -0.002229 0.005310 0.006145 0.011723 0.141361
MiscVal 0.068642 -0.034048 -0.009927 -0.029815 0.003910
MoSold -0.004034 0.009362 0.018588 -0.005965 -0.016053
YrSold 0.043433 -0.014441 0.035352 -0.008201 0.016870
SalePrice -0.076294 0.522896 0.507158 0.477493 0.383977
... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \
Id ... -0.028146 -0.006176 0.004554 -0.046541
MSSubClass ... -0.012634 -0.005462 -0.010571 -0.044049
LotFrontage ... 0.074498 0.134672 0.010692 0.062176
LotArea ... 0.173167 0.086301 -0.023094 0.020574
OverallQual ... 0.240652 0.303482 -0.112950 0.031029
OverallCond ... -0.004530 -0.031172 0.074731 0.025163
YearBuilt ... 0.226891 0.185081 -0.386839 0.032037
YearRemodAdd ... 0.207464 0.223491 -0.192367 0.045907
MasVnrArea ... 0.159718 0.125703 -0.110204 0.018796
BsmtFinSF1 ... 0.205350 0.107696 -0.105608 0.026995
BsmtFinSF2 ... 0.067673 0.004294 0.036749 -0.030186
BsmtUnfSF ... -0.004192 0.130217 -0.003684 0.020857
TotalBsmtSF ... 0.234182 0.244914 -0.099915 0.037960
1stFlrSF ... 0.238699 0.210625 -0.072610 0.056901
2ndFlrSF ... 0.090962 0.210512 0.064217 -0.024422
LowQualFinSF ... -0.025669 0.018852 0.061314 -0.004373
GrLivArea ... 0.247981 0.330795 0.005813 0.021000
BsmtFullBath ... 0.175778 0.063937 -0.051483 0.000296
BsmtHalfBath ... 0.039929 -0.024489 -0.008518 0.034966
FullBath ... 0.189982 0.261509 -0.120246 0.036004
HalfBath ... 0.107275 0.196968 -0.093258 -0.004679
BedroomAbvGr ... 0.045614 0.098687 0.038447 -0.024667
KitchebvGr ... -0.088863 -0.067892 0.028587 -0.024534
TotRmsAbvGrd ... 0.165236 0.237234 0.000861 -0.006657
Fireplaces ... 0.198180 0.170942 -0.029461 0.011447
GarageYrBlt ... 0.221241 0.214768 -0.284534 0.023803
GarageCars ... 0.226669 0.211257 -0.151857 0.036116
GarageArea ... 0.225418 0.238895 -0.121603 0.035410
WoodDeckSF ... 1.000000 0.058911 -0.125486 -0.033008
OpenPorchSF ... 0.058911 1.000000 -0.090870 -0.005401
EnclosedPorch ... -0.125486 -0.090870 1.000000 -0.037395
3SsnPorch ... -0.033008 -0.005401 -0.037395 1.000000
ScreenPorch ... -0.074740 0.075865 -0.083074 -0.031617
PoolArea ... 0.073454 0.061403 0.054397 -0.008036
MiscVal ... -0.009694 -0.018335 0.018445 0.000298
MoSold ... 0.021789 0.068538 -0.025830 0.029761
YrSold ... 0.021575 -0.055585 -0.008496 0.018714
SalePrice ... 0.324650 0.311268 -0.128778 0.045247
ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
Id 0.001769 0.057268 -0.006114 0.018962 0.002776 -0.025343
MSSubClass -0.026414 0.008214 -0.007805 -0.013840 -0.021529 -0.082813
LotFrontage 0.037946 0.181292 -0.000192 0.009735 0.007324 0.333184
LotArea 0.043511 0.077888 0.038226 0.003203 -0.012977 0.264674
OverallQual 0.066403 0.065743 -0.031129 0.068760 -0.025186 0.789997
OverallCond 0.054016 -0.002229 0.068642 -0.004034 0.043433 -0.076294
YearBuilt -0.049169 0.005310 -0.034048 0.009362 -0.014441 0.522896
YearRemodAdd -0.037656 0.006145 -0.009927 0.018588 0.035352 0.507158
MasVnrArea 0.061466 0.011723 -0.029815 -0.005965 -0.008201 0.477493
BsmtFinSF1 0.063299 0.141361 0.003910 -0.016053 0.016870 0.383977
BsmtFinSF2 0.088480 0.041610 0.004802 -0.014878 0.031851 -0.010316
BsmtUnfSF -0.012506 -0.035146 -0.023857 0.033432 -0.040377 0.215740
TotalBsmtSF 0.085831 0.126820 -0.018237 0.011558 -0.011451 0.612971
1stFlrSF 0.090338 0.132669 -0.020931 0.031148 -0.009063 0.606849
2ndFlrSF 0.040771 0.081749 0.016257 0.039782 -0.031893 0.322710
LowQualFinSF 0.026627 0.062115 -0.003851 -0.022102 -0.028954 -0.025263
GrLivArea 0.102489 0.170808 -0.002192 0.053792 -0.035801 0.710080
BsmtFullBath 0.024157 0.068057 -0.022813 -0.024940 0.067489 0.225027
BsmtHalfBath 0.031774 0.019937 -0.007484 0.033352 -0.046571 -0.015993
FullBath -0.006959 0.050103 -0.013964 0.058944 -0.019985 0.562491
HalfBath 0.073391 0.022636 0.001528 -0.008772 -0.010056 0.282040
BedroomAbvGr 0.044270 0.070928 0.007728 0.052450 -0.038584 0.171934
KitchebvGr -0.051430 -0.014485 0.062926 0.031032 0.033943 -0.137419
TotRmsAbvGrd 0.059632 0.083979 0.024853 0.041611 -0.034886 0.536311
Fireplaces 0.185752 0.095602 0.001518 0.052030 -0.024917 0.468930
GarageYrBlt -0.075017 -0.014421 -0.031418 0.002013 -0.001371 0.466247
GarageCars 0.051277 0.021140 -0.042900 0.039393 -0.038065 0.639686
GarageArea 0.052130 0.061292 -0.027230 0.026719 -0.025754 0.622492
WoodDeckSF -0.074740 0.073454 -0.009694 0.021789 0.021575 0.324650
OpenPorchSF 0.075865 0.061403 -0.018335 0.068538 -0.055585 0.311268
EnclosedPorch -0.083074 0.054397 0.018445 -0.025830 -0.008496 -0.128778
3SsnPorch -0.031617 -0.008036 0.000298 0.029761 0.018714 0.045247
ScreenPorch 1.000000 0.051216 0.031822 0.023695 0.010786 0.113044
PoolArea 0.051216 1.000000 0.029636 -0.033785 -0.059800 0.093109
MiscVal 0.031822 0.029636 1.000000 -0.006400 0.004938 -0.020951
MoSold 0.023695 -0.033785 -0.006400 1.000000 -0.145367 0.045136
YrSold 0.010786 -0.059800 0.004938 -0.145367 1.000000 -0.026180
SalePrice 0.113044 0.093109 -0.020951 0.045136 -0.026180 1.000000
[38 rows x 38 columns]
# Using heatmap to visualize the correlation matrix.
# The matrix is 38 x 38, so per-cell annotations are illegible on the default
# figure size: draw on a large canvas and round the labels to two decimals.
plt.figure(figsize=(24, 20))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", annot_kws={"size": 7})
<AxesSubplot:>
# Pair plot for distribution and density.
# Passing the whole frame draws a grid over all 38 numeric columns
# (38 x 38 = 1,444 axes), which is very slow and unreadable. Restrict the grid
# to the target and a few of the features most strongly correlated with it.
pair_cols = ["SalePrice", "OverallQual", "GrLivArea", "GarageCars", "TotalBsmtSF"]
sns.pairplot(data[pair_cols])
<seaborn.axisgrid.PairGrid at 0x1a917590b20>
# Focused pair plot: LotFrontage against SalePrice.
pair_subset = data.loc[:, ['LotFrontage', 'SalePrice']]
sns.pairplot(pair_subset)
<seaborn.axisgrid.PairGrid at 0x1a959a82fd0>
# Keep only the non-numeric columns for the categorical analysis.
data_cat = data.select_dtypes(exclude="number")
data_cat
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1 | RL | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 2 | RL | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 3 | RL | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | ... | Detchd | Unf | TA | TA | Y | NaN | NaN | NaN | WD | Abnorml |
| 4 | RL | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1456 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | ... | Attchd | Unf | TA | TA | Y | NaN | MnPrv | NaN | WD | Normal |
| 1457 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | GdPrv | Shed | WD | Normal |
| 1458 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | mes | Norm | ... | Attchd | Unf | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1459 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | ... | Attchd | Fin | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
1460 rows × 43 columns
# Missing-value count per categorical column.
data_cat.isna().sum()
MSZoning 0 Street 0 Alley 1369 LotShape 0 LandContour 0 Utilities 0 LotConfig 0 LandSlope 0 Neighborhood 0 Condition1 0 Condition2 0 BldgType 0 HouseStyle 0 RoofStyle 0 RoofMatl 0 Exterior1st 0 Exterior2nd 0 MasVnrType 8 ExterQual 0 ExterCond 0 Foundation 0 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Heating 0 HeatingQC 0 CentralAir 0 Electrical 1 KitchenQual 0 Functiol 0 FireplaceQu 690 GarageType 81 GarageFinish 81 GarageQual 81 GarageCond 81 PavedDrive 0 PoolQC 1453 Fence 1179 MiscFeature 1406 SaleType 0 SaleCondition 0 dtype: int64
# Impute the heavily-missing categorical columns with their most frequent
# value. Modes observed on this dataset: Alley=Grvl, MiscFeature=Shed,
# Fence=MnPrv, PoolQC=Gd, FireplaceQu=Gd, GarageType=Attchd, GarageFinish=Unf,
# GarageQual=TA, GarageCond=TA.
# NOTE(review): several of these columns are almost entirely missing (e.g.
# PoolQC 1453 of 1460 rows). The NaNs presumably encode "feature absent", in
# which case an explicit "None" category would be more faithful than the
# mode -- confirm against the data dictionary before modelling.
mode_filled_cols = [
    'Alley', 'MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]
mode_values = {col: data_cat[col].mode()[0] for col in mode_filled_cols}
print(mode_values)
# Assign the filled frame back instead of calling
# `Series.fillna(inplace=True)` on a column selection: that is chained
# assignment, deprecated by pandas and a no-op under Copy-on-Write.
data_cat = data_cat.fillna(value=mode_values)
# Re-check the per-column missing counts after imputation.
data_cat.isnull().sum()
MSZoning 0 Street 0 Alley 0 LotShape 0 LandContour 0 Utilities 0 LotConfig 0 LandSlope 0 Neighborhood 0 Condition1 0 Condition2 0 BldgType 0 HouseStyle 0 RoofStyle 0 RoofMatl 0 Exterior1st 0 Exterior2nd 0 MasVnrType 8 ExterQual 0 ExterCond 0 Foundation 0 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Heating 0 HeatingQC 0 CentralAir 0 Electrical 1 KitchenQual 0 Functiol 0 FireplaceQu 0 GarageType 0 GarageFinish 0 GarageQual 0 GarageCond 0 PavedDrive 0 PoolQC 0 Fence 0 MiscFeature 0 SaleType 0 SaleCondition 0 dtype: int64
# Discard every row that still holds a missing categorical value, then confirm
# none remain.
data_cat.dropna(axis=0, how="any", inplace=True)
data_cat.isna().sum()
MSZoning 0 Street 0 Alley 0 LotShape 0 LandContour 0 Utilities 0 LotConfig 0 LandSlope 0 Neighborhood 0 Condition1 0 Condition2 0 BldgType 0 HouseStyle 0 RoofStyle 0 RoofMatl 0 Exterior1st 0 Exterior2nd 0 MasVnrType 0 ExterQual 0 ExterCond 0 Foundation 0 BsmtQual 0 BsmtCond 0 BsmtExposure 0 BsmtFinType1 0 BsmtFinType2 0 Heating 0 HeatingQC 0 CentralAir 0 Electrical 0 KitchenQual 0 Functiol 0 FireplaceQu 0 GarageType 0 GarageFinish 0 GarageQual 0 GarageCond 0 PavedDrive 0 PoolQC 0 Fence 0 MiscFeature 0 SaleType 0 SaleCondition 0 dtype: int64
# Spread of SalePrice within each SaleCondition category.
sns.boxplot(x="SalePrice", y="SaleCondition", data=data)
<AxesSubplot:xlabel='SalePrice', ylabel='SaleCondition'>
# Frequency of each LotShape category; the same series drives both the axis
# and the colour grouping.
lot_shape = data["LotShape"]
sns.countplot(x=lot_shape, hue=lot_shape)
plt.title("Feature Engineering")
Text(0.5, 1.0, 'Feature Engineering')
# Join the cleaned categorical and numeric frames side by side.
# `pd.concat` defaults to axis=0, which stacked the two frames vertically and
# produced 2864 rows in which one half of the columns was always NaN.
# axis=1 aligns the frames on the row index instead, and join="inner" keeps
# only the rows that survived dropna() in both frames.
concat_df = pd.concat([data_cat, data_num], axis=1, join="inner")
concat_df
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | RL | Pave | Grvl | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | RL | Pave | Grvl | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | RL | Pave | Grvl | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | RL | Pave | Grvl | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 40.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 | 2007.0 | 175000.0 |
| 1456 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 349.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2010.0 | 210000.0 |
| 1457 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 60.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2500.0 | 5.0 | 2010.0 | 266500.0 |
| 1458 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 366.0 | 0.0 | 112.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 2010.0 | 142125.0 |
| 1459 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 736.0 | 68.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 2008.0 | 147500.0 |
2864 rows × 81 columns
# Box plot of SalePrice over the combined frame to expose outliers.
sns.boxplot(x="SalePrice", data=concat_df)
<AxesSubplot:xlabel='SalePrice'>